Note: Raw data can be found in the GEO entry, but decided to use already normalised version available in Gandal’s github repository for now.
datMeta = read.csv('./../Data/Chow/Chow_GSE28475_datMeta.csv')
datExpr = read.delim('./../Data/Chow/Chow_GSE28475_quantile_normalized.txt')
# Filter datExpr columns
rownames(datExpr) = datExpr[,1]
datExpr = datExpr[,-1]
datExpr = datExpr[,seq(1, 65, by=2)]
colnames(datExpr) = gsub('[^0-9.-]','', colnames(datExpr))
idx = match(colnames(datExpr), datMeta$SampleID)
datMeta = datMeta[idx,]
# Annotate Probes
ensembl = useMart('ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
getinfo = c('illumina_humanref_8_v3', 'ensembl_gene_id', 'entrezgene', 'external_gene_id',
'chromosome_name', 'start_position', 'end_position')
geneDat = getBM(attributes = getinfo, filters='illumina_humanref_8_v3', values=rownames(datExpr), mart=ensembl)
## Batch submitting query [>-------------------------] 4% eta: 4m Batch
## submitting query [=>------------------------] 6% eta: 5m Batch submitting
## query [=>------------------------] 8% eta: 5m Batch submitting query
## [==>-----------------------] 10% eta: 6m Batch submitting query
## [==>-----------------------] 12% eta: 6m Batch submitting query
## [===>----------------------] 14% eta: 6m Batch submitting query
## [===>----------------------] 16% eta: 6m Batch submitting query
## [====>---------------------] 18% eta: 6m Batch submitting query
## [====>---------------------] 20% eta: 6m Batch submitting query
## [=====>--------------------] 22% eta: 5m Batch submitting query
## [=====>--------------------] 24% eta: 5m Batch submitting query
## [======>-------------------] 27% eta: 5m Batch submitting query
## [======>-------------------] 29% eta: 5m Batch submitting query
## [=======>------------------] 31% eta: 5m Batch submitting query
## [=======>------------------] 33% eta: 5m Batch submitting query
## [========>-----------------] 35% eta: 5m Batch submitting query
## [=========>----------------] 37% eta: 5m Batch submitting query
## [=========>----------------] 39% eta: 5m Batch submitting query
## [==========>---------------] 41% eta: 4m Batch submitting query
## [==========>---------------] 43% eta: 4m Batch submitting query
## [===========>--------------] 45% eta: 4m Batch submitting query
## [===========>--------------] 47% eta: 4m Batch submitting query
## [============>-------------] 49% eta: 4m Batch submitting query
## [============>-------------] 51% eta: 4m Batch submitting query
## [=============>------------] 53% eta: 4m Batch submitting query
## [=============>------------] 55% eta: 3m Batch submitting query
## [==============>-----------] 57% eta: 3m Batch submitting query
## [==============>-----------] 59% eta: 3m Batch submitting query
## [===============>----------] 61% eta: 3m Batch submitting query
## [===============>----------] 63% eta: 3m Batch submitting query
## [================>---------] 65% eta: 3m Batch submitting query
## [=================>--------] 67% eta: 2m Batch submitting query
## [=================>--------] 69% eta: 2m Batch submitting query
## [==================>-------] 71% eta: 2m Batch submitting query
## [==================>-------] 73% eta: 2m Batch submitting query
## [===================>------] 76% eta: 2m Batch submitting query
## [===================>------] 78% eta: 2m Batch submitting query
## [====================>-----] 80% eta: 2m Batch submitting query
## [====================>-----] 82% eta: 1m Batch submitting query
## [=====================>----] 84% eta: 1m Batch submitting query
## [=====================>----] 86% eta: 1m Batch submitting query
## [======================>---] 88% eta: 1m Batch submitting query
## [======================>---] 90% eta: 46s Batch submitting query
## [=======================>--] 92% eta: 37s Batch submitting query
## [=======================>--] 94% eta: 28s Batch submitting query
## [========================>-] 96% eta: 19s Batch submitting query
## [========================>-] 98% eta: 9s Batch submitting query
## [==========================] 100% eta: 0s
idx = match(rownames(datExpr), geneDat$illumina_humanref_8_v3)
datGenes = cbind(rownames(datExpr), geneDat[idx,])
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv') %>%
filter(!is.na(ID))
## Parsed with column specification:
## cols(
## status = col_double(),
## `gene-symbol` = col_character(),
## `gene-name` = col_character(),
## chromosome = col_character(),
## `genetic-category` = col_character(),
## `gene-score` = col_double(),
## syndromic = col_double(),
## `number-of-reports` = col_double(),
## ID = col_character()
## )
# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
# Combine SFARI and GO information
gene_info = data.frame('ilmnID'=rownames(datExpr), 'ID' = datGenes$ensembl_gene_id) %>%
left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))
rm(idx, ensembl, getinfo, geneDat, GO_annotations)
RNA-Seq for 33 brain-tissue samples from the PFC, comprising 18 samples from control subjects and 15 from ASD subjects
print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject)), ' different subjects.'))
## [1] "Dataset includes 24356 genes from 33 samples belonging to 33 different subjects."
Diagnosis distribution: Fairly balanced
table(datMeta$DX)
##
## Autism Control
## 15 18
Brain region distribution: All samples belong to the PFC
Sex distribution: All samples are males
table(datMeta$SEX)
##
## F M
## 0 33
Age distribution: Subjects between 2 and 56 years old with a mean close to 20
summary(datMeta$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 8.00 15.00 20.39 30.00 56.00
Seems like there is a change in behaviour around a mean expression of 9, although we would lose most of the probes if we filtered all genes below that threshold
mean_expr = data.frame('ID'=rownames(datExpr),'Mean'=rowMeans(datExpr, na.rm=T),
'SD'=apply(datExpr,1,function(x) sd(x, na.rm=T)))
p1 = mean_expr %>% ggplot(aes(Mean, SD)) + geom_point(alpha=0.01, color='#0099cc') + geom_smooth(method='lm', color='gray') +
geom_vline(xintercept=6, color='gray', linetype='dashed') + theme_minimal()
p2 = mean_expr %>% ggplot(aes(Mean)) + geom_density(fill='#0099cc', color='#0099cc', alpha=0.5) +
geom_vline(xintercept=6, color='gray', linetype='dashed') + theme_minimal()
grid.arrange(p1, p2, ncol=2)
rm(mean_expr, p1, p2)
Filtering out genes with a mean expression lower than 6
to_keep = rowMeans(datExpr, na.rm=TRUE)>6
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep, na.rm=T), ', ', sum(to_keep, na.rm=T), ' remaining'))
## [1] "Removed 2530, 21826 remaining"
rm(to_keep)
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Using \(s_{ij}=|bw(i,j)|\) to define connectivity between genes.
Filtering three samples, all beloning to the ASD diagnosis
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'Diagnosis'=datMeta$DX)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Diagnosis)) + geom_point() +
geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: 51, 52, 75"
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 3 samples, 30 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 21826 genes and 30 samples"
table(datMeta$Batch, datMeta$DX, useNA='ifany')
##
## Autism Control
## 1 3 4
## 2 8 14
## <NA> 1 0
Remove sample without Batch variable
to_keep = !is.na(datMeta$Batch)
datExpr = datExpr[,to_keep]
datMeta = datMeta[to_keep,]
Using sva instead of svaseq because it’s microarray data
mod = model.matrix(~ DX, datMeta)
mod0 = model.matrix(~ 1, datMeta)
sva_fit = datExpr %>% as.matrix %>% sva(mod=mod, mod0=mod0)
## Number of significant surrogate variables is: 7
## Iteration (out of 5 ):1 2 3 4 5
rm(mod, mod0)
Include SV estimations to datMeta information
sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))
datMeta = cbind(datMeta, sv_data)
rm(sv_data)
Using lmFit
mod = model.matrix(~ Batch + SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + DX, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$SampleID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$sampleid, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr)) %>% mutate('ilmnID'=rownames(.))
DE_info = gene_info %>% left_join(top_genes, by='ilmnID')
rm(mod, corfit, lmfit, fit, top_genes)
PCA: Autism samples tend to cluster in the center and there may be an age related pattern in the 1st PC, but neither of these behaviours are very strong
pca = datExpr %>% t %>% prcomp
plot_data = data.frame('ID'=as.numeric(colnames(datExpr)), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by=c('ID'='SampleID')) %>%
dplyr::select('PC1','PC2','AGE','DX')
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
rm(pca, plot_data)
First Principal Component explains 92% of the total variance
There’s a really strong (negative) correlation between the mean expression of a gene and the 1st principal component
pca = datExpr %>% prcomp
plot_data = data.frame('PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3],
'MeanExpr'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.2) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
The higher the SFARI score the higher the mean expression but the lower the standard deviation (although this second relation is weaker). Same as Gandal’s dataset!
plot_data = data.frame('ID'=rownames(datExpr),
'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(DE_info, by=c('ID'='ilmnID'))
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
Except for score 6, there doesn’t seem to be any relation between logFC and SFARI score
ggplotly(DE_info %>% ggplot(aes(x=gene.score, y=abs(logFC), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
theme_minimal() + theme(legend.position='none'))
## Warning: Removed 2530 rows containing non-finite values (stat_boxplot).
save(datExpr, datMeta, datGenes, DE_info, file='./../Data/Chow/cleaned_data.RData')
#load('./../Data/Gupta/preprocessed_data.RData')
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] illuminaHumanv4.db_1.26.0 org.Hs.eg.db_3.7.0
## [3] AnnotationDbi_1.44.0 IRanges_2.16.0
## [5] S4Vectors_0.20.1 limma_3.38.3
## [7] glue_1.3.1 sva_3.30.1
## [9] BiocParallel_1.16.6 genefilter_1.64.0
## [11] mgcv_1.8-26 nlme_3.1-137
## [13] vsn_3.50.0 Biobase_2.42.0
## [15] BiocGenerics_0.28.0 WGCNA_1.66
## [17] fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [19] forcats_0.3.0 stringr_1.4.0
## [21] dplyr_0.8.0.1 purrr_0.3.1
## [23] readr_1.3.1 tidyr_0.8.3
## [25] tibble_2.1.1 tidyverse_1.2.1
## [27] viridis_0.5.1 viridisLite_0.3.0
## [29] gridExtra_2.3 plotlyutils_0.0.0.9000
## [31] plotly_4.8.0 ggplot2_3.1.0
## [33] biomaRt_2.38.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 htmlTable_1.11.2 base64enc_0.1-3
## [4] rstudioapi_0.7 affyio_1.52.0 bit64_0.9-7
## [7] mvtnorm_1.0-7 lubridate_1.7.4 xml2_1.2.0
## [10] codetools_0.2-15 splines_3.5.2 doParallel_1.0.14
## [13] impute_1.56.0 robustbase_0.93-0 knitr_1.22
## [16] Formula_1.2-3 jsonlite_1.6 Cairo_1.5-9
## [19] annotate_1.60.1 broom_0.5.1 cluster_2.0.7-1
## [22] GO.db_3.7.0 shiny_1.2.0 BiocManager_1.30.4
## [25] rrcov_1.4-3 compiler_3.5.2 httr_1.4.0
## [28] backports_1.1.2 assertthat_0.2.1 Matrix_1.2-15
## [31] lazyeval_0.2.2 cli_1.1.0 later_0.8.0
## [34] acepack_1.4.1 htmltools_0.3.6 prettyunits_1.0.2
## [37] tools_3.5.2 gtable_0.2.0 affy_1.60.0
## [40] Rcpp_1.0.1 cellranger_1.1.0 preprocessCore_1.44.0
## [43] crosstalk_1.0.0 iterators_1.0.9 xfun_0.5
## [46] rvest_0.3.2 mime_0.6 statmod_1.4.30
## [49] XML_3.98-1.11 DEoptimR_1.0-8 MASS_7.3-51.1
## [52] zlibbioc_1.28.0 scales_1.0.0 promises_1.0.1
## [55] hms_0.4.2 RColorBrewer_1.1-2 curl_3.3
## [58] yaml_2.2.0 memoise_1.1.0 rpart_4.1-13
## [61] latticeExtra_0.6-28 stringi_1.4.3 RSQLite_2.1.1
## [64] pcaPP_1.9-73 foreach_1.4.4 checkmate_1.8.5
## [67] rlang_0.3.2 pkgconfig_2.0.2 matrixStats_0.54.0
## [70] bitops_1.0-6 evaluate_0.13 lattice_0.20-38
## [73] labeling_0.3 htmlwidgets_1.3 bit_1.1-14
## [76] tidyselect_0.2.5 robust_0.4-18 plyr_1.8.4
## [79] magrittr_1.5 R6_2.4.0 generics_0.0.2
## [82] Hmisc_4.1-1 fit.models_0.5-14 DBI_1.0.0
## [85] pillar_1.3.1 haven_1.1.1 foreign_0.8-71
## [88] withr_2.1.2 survival_2.43-3 RCurl_1.95-4.10
## [91] nnet_7.3-12 modelr_0.1.4 crayon_1.3.4
## [94] rmarkdown_1.12 progress_1.2.0 grid_3.5.2
## [97] readxl_1.1.0 data.table_1.12.0 blob_1.1.1
## [100] digest_0.6.18 xtable_1.8-3 httpuv_1.5.0
## [103] munsell_0.5.0